NIH Ward Lab - Neurons Day8 - Preprocessing QC statistics ¶

July 2025 - Nancy Y¶

Reran by Sagy on Sep 15 (2025) - removing CD41 and FUS lines

In [1]:
import io
import os
import sys
import pandas as pd
import contextlib
from IPython.display import display, Javascript

NOVA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
NOVA_DATA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
os.environ['NOVA_HOME'] = NOVA_HOME
sys.path.insert(1, os.getenv("NOVA_HOME"))
print(f"NOVA_HOME: {os.getenv('NOVA_HOME')}")


root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw', 'NIH', 'indi-image-pilot-20241128')
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'processed', 'ManuscriptFinalData_80pct', 'NIH')

LOGS_PATH = os.path.join(NOVA_HOME, "outputs", "preprocessing", "ManuscriptFinalData_80pct", "NIH", "logs")
PLOT_PATH = os.path.join(NOVA_HOME, 'outputs', 'preprocessing', "ManuscriptFinalData_80pct", 'NIH', 'QC_figures')


from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
                                                show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
                                                show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
                                                calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
                                                plot_cell_count, plot_catplot, plot_hm_of_mean_cell_count_per_tile, \
                                                run_calc_hist_new, show_total_valid_tiles_per_marker_and_batch
                                                
from tools.preprocessing_tools.qc_reports.qc_config import NIH_d8_panels, NIH_d8_markers, NIH_d8_marker_info, NIH_d8_cell_lines, NIH_d8_cell_lines_to_cond,\
                                    NIH_d8_cell_lines_for_disp, NIH_d8_reps, NIH_d8_line_colors, NIH_d8_lines_order, NIH_d8_custom_palette,\
                                    NIH_d8_expected_dapi_raw
%load_ext autoreload
%autoreload 2
NOVA_HOME: /home/projects/hornsteinlab/Collaboration/NOVA
In [2]:
# choose batches
batches = [f'batch{i}' for i in range(1,4)]
batches
Out[2]:
['batch1', 'batch2', 'batch3']
In [3]:
df = log_files_qc(LOGS_PATH, only_wt_cond=False, batches=batches, filename_split='-',site_location=0)

#df['cell_line_cond'] = df['cell_line_cond'].str.replace(" ", "_")
df = df[df['cell_line'] == 'WT']

df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
reading logs of batch3
reading logs of batch2
reading logs of batch1

Total of 3 files were read.
Before dup handeling  (124974, 21)
After duplication removal #1: (124974, 22)
After duplication removal #2: (124974, 22)

Actual Files Validation¶

Raw Files Validation¶

  1. How many site tiff files do we have in each folder?
  2. Are all existing files valid? (tif, at least 2049kB, not corrupetd)
In [4]:
raws = run_validate_folder_structure(root_directory_raw, False, 
                                     NIH_d8_panels, 
                                     NIH_d8_markers.copy(),
                                     PLOT_PATH, 
                                     NIH_d8_marker_info,
                                     NIH_d8_cell_lines_to_cond, 
                                     NIH_d8_reps, 
                                     NIH_d8_cell_lines_for_disp,
                                     NIH_d8_expected_dapi_raw,
                                     batches=batches, 
                                     fig_width=8, fig_height = 40,
                                     expected_count=25, check_antibody=False)
batch1
Folder structure is valid.
No bad files are found.
Total Sites:  17200
No description has been provided for this image
========
batch2
Folder structure is valid.
No bad files are found.
Total Sites:  17200
No description has been provided for this image
========
batch3
Folder structure is valid.
No bad files are found.
Total Sites:  17200
No description has been provided for this image
========
====================
In [5]:
## Missing data issue was fixed

differences = (raws[0] != raws[1]).stack()
differences = differences[differences].index.to_frame(index=False)
differences.columns = ["Marker", "Rep", "Condition"]
for condition in differences["Condition"].unique():
    print(f"Condition: {condition}")
    condition_data = differences[differences["Condition"] == condition]
    for rep in condition_data["Rep"].unique():
        markers = condition_data[condition_data["Rep"] == rep]["Marker"].tolist()
        print(f"  Rep: {rep}")
        print(f"    Markers: {', '.join(markers)}")

Processed Files Validation¶

  1. How many site npy files do we have in each folder? -> How many sites survived the pre-processing?
  2. Are all existing files valid? (at least 100kB, npy not corrupted)
In [6]:
procs = run_validate_folder_structure(root_directory_proc, True, 
                                      NIH_d8_panels, 
                                      NIH_d8_markers,
                                      PLOT_PATH,
                                      NIH_d8_marker_info,
                                      NIH_d8_cell_lines_to_cond, 
                                      NIH_d8_reps, 
                                      NIH_d8_cell_lines_for_disp, 
                                      NIH_d8_expected_dapi_raw,
                                      fig_width=8, fig_height=40,
                                      expected_count=25, 
                                      check_antibody=False, 
                                      batches=batches)
batch1
Folder structure is valid.
No bad files are found.
Total Sites:  16575
No description has been provided for this image
========
batch2
Folder structure is valid.
No bad files are found.
Total Sites:  16336
No description has been provided for this image
========
batch3
Folder structure is valid.
No bad files are found.
Total Sites:  16326
No description has been provided for this image
========
====================

Difference between Raw and Processed¶

In [7]:
display_diff(batches, raws, procs, PLOT_PATH, fig_width=8, fig_height=40)
batch1
No description has been provided for this image
========
batch2
No description has been provided for this image
========
batch3
No description has been provided for this image
========

Variance in each batch (of processed files)¶

In [8]:
for batch in batches[:1]:
    with contextlib.redirect_stdout(io.StringIO()):
        var = sample_and_calc_variance(root_directory_proc, 
                                       batch, 
                                       sample_size_per_markers=50, 
                                       cond_count=2, 
                                       rep_count=len(NIH_d8_reps), 
                                       num_markers=len(NIH_d8_markers))
    print(f'{batch} var: ',var)
batch1 var:  0.029910257553058615

Preprocessing Filtering qc¶

By order of filtering

1. % site survival after Brenner on DAPI channel¶

Percentage out of the total sites

In [9]:
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,
                                                         batches, 
                                                         NIH_d8_line_colors, 
                                                         NIH_d8_panels, 
                                                         NIH_d8_reps, 
                                                         figsize=(6,18),
                                                         vmax=25)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

2. % Site survival after Cellpose¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if Cellpose found 0 cells in it.

In [10]:
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi, 
                                                           batches, 
                                                           dapi_filter_by_brenner, 
                                                           NIH_d8_line_colors, 
                                                           NIH_d8_panels, 
                                                           NIH_d8_reps, 
                                                           figsize=(6,18))
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

3. % Site survival by tiling¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if after tiling, no tile is containing at least one whole cell that Cellpose detected.

In [11]:
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi, 
                                                     batches, 
                                                     dapi_filter_by_cellpose, 
                                                     NIH_d8_line_colors, 
                                                     NIH_d8_panels, 
                                                     NIH_d8_reps, 
                                                     figsize=(6,18))
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

4. % Site survival after Brenner on target channel¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).

In [12]:
show_site_survival_target_brenner(df_dapi, 
                                  df_target, 
                                  dapi_filter_by_tiling, 
                                  NIH_d8_markers,
                                  figsize=(6,18))
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Statistics About the Processed Files¶

In [13]:
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, NIH_d8_markers)

Total tiles¶

In [14]:
# markers_for_dnls = markers.copy() #TODO need to change according to - if we use all markers or just the d8 ones!!!!
# markers_for_dnls.remove('TIA1')
# markers_for_dnls += ['TDP43B']

total_sum[total_sum.marker.isin(NIH_d8_markers)].n_valid_tiles.sum()
Out[14]:
590634

Total whole nuclei in tiles¶

In [15]:
total_sum[total_sum.marker =='DAPI'].site_whole_cells_counts_sum.sum()
Out[15]:
110373.0

Total nuclei in sites¶

In [16]:
total_sum[total_sum.marker =='DAPI'].site_cell_count.sum()
Out[16]:
347157.0
In [17]:
show_total_sum_tables(total_sum)
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch1
count 688.000000 688.000000 688.000000 688.000000
mean 332.860465 3.328605 237.784884 761.949128
std 56.413226 0.564132 50.451934 135.014222
min 80.000000 0.800000 56.000000 155.000000
25% 301.000000 3.010000 207.000000 699.000000
50% 336.000000 3.360000 235.500000 768.000000
75% 369.000000 3.690000 268.250000 852.000000
max 466.000000 4.660000 360.000000 1102.000000
sum 229008.000000 NaN 163596.000000 524221.000000
expected_count 450.000000 450.000000 450.000000 450.000000
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch2
count 687.000000 687.000000 687.000000 687.000000
mean 269.903930 2.699039 189.056769 590.714702
std 50.685184 0.506852 38.856257 116.194745
min 4.000000 0.040000 3.000000 11.000000
25% 242.000000 2.420000 167.000000 529.000000
50% 271.000000 2.710000 191.000000 600.000000
75% 299.000000 2.990000 212.000000 657.500000
max 385.000000 3.850000 290.000000 918.000000
sum 185424.000000 NaN 129882.000000 405821.000000
expected_count 450.000000 450.000000 450.000000 450.000000
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch3
count 687.000000 687.000000 687.000000 687.0000
mean 256.480349 2.564803 179.742358 556.9869
std 50.003112 0.500031 39.653665 112.6077
min 19.000000 0.190000 7.000000 37.0000
25% 231.000000 2.310000 155.000000 494.5000
50% 265.000000 2.650000 186.000000 578.0000
75% 287.000000 2.870000 207.000000 624.0000
max 378.000000 3.780000 273.000000 818.0000
sum 176202.000000 NaN 123483.000000 382650.0000
expected_count 450.000000 450.000000 450.000000 450.0000
n valid tiles % valid tiles site_whole_cells_counts_sum site_cell_count
All batches
count 2062.000000 2062.000000 2062.000000 2.062000e+03
mean 286.437439 2.864374 202.211930 6.366111e+02
std 62.111166 0.621112 50.227316 1.511637e+02
min 4.000000 0.040000 3.000000 1.100000e+01
25% 253.000000 2.530000 170.000000 5.420000e+02
50% 284.000000 2.840000 201.000000 6.290000e+02
75% 329.000000 3.290000 231.000000 7.330000e+02
max 466.000000 4.660000 360.000000 1.102000e+03
sum 590634.000000 NaN 416961.000000 1.312692e+06
expected_count 450.000000 450.000000 450.000000 4.500000e+02

Show Total Tile Counts¶

For each batch, cell line, replicate and marker: Total number of tiles

First, we look at all cell lines togther:¶

In [18]:
show_total_valid_tiles_per_marker_and_batch(total_sum, vmax=15000)
No description has been provided for this image

Separating into cell lines & batches:¶

In [19]:
to_heatmap = total_sum.rename(columns={'n_valid_tiles':'index'})
plot_filtering_heatmap(to_heatmap, 
                       extra_index='marker', 
                       vmin=None, vmax=None,
                       xlabel = 'Total number of tiles', 
                       show_sum=True, figsize=(7,28), 
                       fmt=".0f")
No description has been provided for this image
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
No description has been provided for this image
No description has been provided for this image
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
No description has been provided for this image
No description has been provided for this image
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
No description has been provided for this image

Show Total Whole Cell Counts¶

For each batch, cell line, replicate and markerTotal number of tiles

In [20]:
to_heatmap = total_sum.rename(columns={'site_whole_cells_counts_sum':'index'})
plot_filtering_heatmap(to_heatmap, 
                       extra_index='marker', 
                       vmin=None, vmax=None,
                       xlabel = 'Total number of whole cells', 
                       show_sum=True, 
                       figsize=(7,28), 
                       fmt=".0f")
No description has been provided for this image
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
No description has been provided for this image
No description has been provided for this image
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
No description has been provided for this image
No description has been provided for this image
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
No description has been provided for this image

Show Cell Count Statistics per Batch¶

In [21]:
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]

plot_cell_count(df_no_empty_sites, 
                NIH_d8_lines_order, 
                NIH_d8_custom_palette, 
                y='site_cell_count_sum', 
                title='Cell Count Average per Site (from tiles)', 
                figsize=(16,6))


plot_cell_count(df_no_empty_sites, 
                NIH_d8_lines_order, 
                NIH_d8_custom_palette, 
                y='site_whole_cells_counts_sum',
                title='Whole Cell Count Average per Site',
                figsize=(16,6))


plot_cell_count(df_no_empty_sites, 
                NIH_d8_lines_order, 
                NIH_d8_custom_palette, 
                y='site_cell_count',
                title='Cellpose Cell Count Average per Site',
                figsize=(16,6))
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Show Tiles per Site Statistics¶

In [22]:
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
Out[22]:
cell_line_cond
WT Untreated    11.889007
WT stress       11.968573
Name: n_valid_tiles, dtype: float64
In [23]:
# number of valid tiles per site (on average)
import numpy as np
np.mean(df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean())
Out[23]:
11.92879002538194
In [24]:
df_dapi[['site_cell_count']].mean()
Out[24]:
site_cell_count    26.518753
dtype: float64
In [25]:
# plot_catplot(df_dapi, 
#              NIH_d8_custom_palette,
#              NIH_d8_reps, 
#              x='n_valid_tiles', 
#              x_title='valid tiles count', 
#              batch_min=1, 
#              batch_max=3, 
#              height=6)

Show Mean of cell count in valid tiles¶

In [26]:
# plot_hm_of_mean_cell_count_per_tile(df_dapi, 
#                                     split_by='rep', 
#                                     rows='cell_line_cond', 
#                                     columns='panel', 
#                                     figsize=(18,6))
In [27]:
df_dapi[['cells_count_in_valid_tiles_mean']].mean()
Out[27]:
cells_count_in_valid_tiles_mean    1.777164
dtype: float64
In [28]:
df_dapi[['site_cell_count']].mean()
Out[28]:
site_cell_count    26.518753
dtype: float64

Assessing Staining Reproducibility and Outliers¶

In [29]:
# for batch in batches:
#     print(batch)
#     run_calc_hist_new(f'{batch}', dnls_opera_cell_lines_for_disp, dnls_opera_markers,
#                       root_directory_raw, root_directory_proc,
#                            hist_sample=10,sample_size_per_markers=200, ncols=8, nrows=4, dnls=True)
#     print("="*30)
In [30]:
# # save notebook as HTML 
# from IPython.display import display, Javascript
# display(Javascript('IPython.notebook.save_checkpoint();'))
# os.system(f'jupyter nbconvert --to html {NOVA_HOME}/tools/preprocessing_tools/qc_reports/qc_report_NIH_NeuronsDay8.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/ManuscriptFinalData/qc_report_NIH_NeuronsDay8.html')
In [ ]: